{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "[['the', 'sky', 'is', 'blue'], ['sky', 'is', 'blue', 'and', 'sky', 'is', 'beautiful'], ['the', 'beautiful', 'sky', 'is', 'so', 'blue'], ['i', 'love', 'blue', 'cheese']]\n[['loving', 'this', 'blue', 'sky', 'today']]\n[[ 0.00927303  0.02162506 -0.01460403 -0.00401924  0.00250394 -0.00629804\n  -0.00401768 -0.00306395  0.0192884  -0.01135796]\n [ 0.00364845  0.01165146 -0.00032804 -0.01386576  0.00595418  0.00139123\n  -0.01364638  0.01582027  0.0173904  -0.00911294]\n [-0.00072099  0.02206395 -0.00713337 -0.00304041  0.00782759 -0.00203281\n  -0.00206481  0.00022724  0.0195482  -0.01270212]\n [ 0.00806398  0.04998796 -0.03547955  0.01987278 -0.00656114  0.03626325\n   0.02452678 -0.03793738 -0.02828991  0.01549027]]\n[[-5.14067942e-03  3.62036284e-02 -6.71307649e-03  5.05944155e-03\n   4.71267011e-03 -7.53486529e-04 -1.14511903e-02 -1.03723258e-04\n   3.48817557e-05 -1.34660294e-02]]\n"
    }
   ],
   "source": [
    "import gensim\n",
    "import nltk\n",
    "import numpy as np\n",
    "\n",
    "#自制语料\n",
    "CORPUS = [\n",
    "'the sky is blue',\n",
    "'sky is blue and sky is beautiful',\n",
    "'the beautiful sky is so blue',\n",
    "'i love blue cheese'\n",
    "]\n",
    "\n",
    "new_doc = ['loving this blue sky today']\n",
    "#tokenize corpus\n",
    "TOKENIZED_CORPUS=[nltk.word_tokenize(sentence) for sentence in CORPUS]\n",
    "tokenized_new_doc=[nltk.word_tokenize(sentence) for sentence in new_doc]\n",
    "print(TOKENIZED_CORPUS)\n",
    "print(tokenized_new_doc)\n",
    "model=gensim.models.Word2Vec(TOKENIZED_CORPUS,size=10,window=10,min_count=2,sample=1e-3)\n",
    "\n",
    "#num_features表示的文本单词大小\n",
    "def average_word_vectors(words,model,vocabulary,num_features):\n",
    "    feature_vector=np.zeros((num_features,),dtype='float64')\n",
    "    nwords=0\n",
    "    for word in words:\n",
    "        if word in vocabulary:\n",
    "            nwords=nwords+1\n",
    "            feature_vector=np.add(feature_vector,model[word])\n",
    "    if nwords:\n",
    "        feature_vector=np.divide(feature_vector,nwords)\n",
    "    return feature_vector\n",
    "\n",
    "def averaged_word_vectorizer(corpus,model,num_features):\n",
    "    #get the all vocabulary\n",
    "    vocabulary=set(model.wv.index2word)\n",
    "    features=[average_word_vectors(tokenized_sentence,model,vocabulary,num_features) for tokenized_sentence in corpus]\n",
    "    return np.array(features)\n",
    "\n",
    "avg_word_vec_features=averaged_word_vectorizer(TOKENIZED_CORPUS,model=model,num_features=10)\n",
    "print(avg_word_vec_features)\n",
    "\n",
    "nd_avg_word_vec_features=averaged_word_vectorizer(corpus=tokenized_new_doc,model=model,num_features=10)\n",
    "print(nd_avg_word_vec_features)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tfidf_wtd_avg_word_vectors(words,tfidf_vector,tfidf_vocabulary,model,num_features):\n",
    "    # print(\"tfidf_vector\", tfidf_vector)\n",
    "    # print(\"tfidf_vocabulary\", tfidf_vocabulary)\n",
    "    word_tfidfs=[tfidf_vector[0,tfidf_vocabulary.get(word)] if tfidf_vocabulary.get(word) else 0 for word in words]\n",
    "    # print(\"word_tfidfs\", word_tfidfs)\n",
    "    word_tfidf_map={word:tfidf_val for word,tfidf_val in zip(words,word_tfidfs)}\n",
    "    # print(\"word_tfidf_map\", word_tfidf_map)\n",
    "    feature_vector=np.zeros((num_features,),dtype='float64')\n",
    "    vocabulary=set(model.wv.index2word)\n",
    "    wts=0\n",
    "    for word in words:\n",
    "        if word in vocabulary:\n",
    "            word_vector=model[word]\n",
    "            weighted_word_vector=word_tfidf_map[word]*word_vector\n",
    "            wts=wts+word_tfidf_map[word]\n",
    "            feature_vector=np.add(feature_vector,weighted_word_vector)\n",
    "    if wts:\n",
    "        feature_vector=np.divide(feature_vector,wts)\n",
    "    return feature_vector\n",
    "\n",
    "def tfidf_weighted_averaged_word_vectorizer(corpus,tfidf_vectors,tfidf_vocabulary,model,num_features):\n",
    "    docs_tfidfs=[(doc,doc_tfidf) for doc,doc_tfidf in zip(corpus,tfidf_vectors)]\n",
    "    features=[tfidf_wtd_avg_word_vectors(tokenized_sentence,tfidf,tfidf_vocabulary,model,num_features) for tokenized_sentence,tfidf in docs_tfidfs]\n",
    "    return np.array(features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "and  beautiful  blue  cheese    is  love   sky    so   the\n0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60\n1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00\n2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43\n3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00\n   and  beautiful  blue  cheese   is  love   sky   so  the\n0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0\n[[ 0.00889181  0.0213254  -0.01446018 -0.00393169  0.00334673 -0.01016883\n  -0.00417116 -0.0032925   0.02317886 -0.01393146]\n [ 0.00650283  0.00621648  0.00205088 -0.01895563  0.00566368 -0.00374799\n  -0.01961181  0.02232814  0.02252023 -0.01132472]\n [-0.0026915   0.02190799 -0.00576854 -0.00280893  0.00936754 -0.00428316\n  -0.0018545   0.00060479  0.02257352 -0.01490022]\n [ 0.00806398  0.04998796 -0.03547955  0.01987278 -0.00656114  0.03626325\n   0.02452678 -0.03793738 -0.02828991  0.01549027]]\n"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "import pandas as pd\n",
    "\n",
    "def tfidf_transformer(bow_matrix):\n",
    "    \n",
    "    transformer = TfidfTransformer(norm='l2',\n",
    "                                   smooth_idf=True,\n",
    "                                   use_idf=True)\n",
    "    tfidf_matrix = transformer.fit_transform(bow_matrix)\n",
    "    return transformer, tfidf_matrix\n",
    "\n",
    "def tfidf_extractor(corpus, ngram_range=(1,1)):\n",
    "    \n",
    "    vectorizer = TfidfVectorizer(min_df=1, \n",
    "                                 norm='l2',\n",
    "                                 smooth_idf=True,\n",
    "                                 use_idf=True,\n",
    "                                 ngram_range=ngram_range)\n",
    "    features = vectorizer.fit_transform(corpus)\n",
    "    return vectorizer, features\n",
    "\n",
    "def bow_extractor(corpus, ngram_range=(1,1)):\n",
    "    #min_df为1说明文档中词频最小为1也会被考虑\n",
    "    #ngram_range可以设置(1,3)将建立包括所有unigram、bigram、trigram的向量空间\n",
    "    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)\n",
    "    features = vectorizer.fit_transform(corpus)\n",
    "    return vectorizer, features\n",
    "\n",
    "\n",
    "def display_features(features, feature_names):\n",
    "    df = pd.DataFrame(data=features,\n",
    "                      columns=feature_names)\n",
    "    print(df)\n",
    "\n",
    "bow_vectorizer, bow_features = bow_extractor(CORPUS)\n",
    "feature_names = bow_vectorizer.get_feature_names()\n",
    "tfidf_trans, tfidf_features = tfidf_transformer(bow_features)\n",
    "tfidf_vectorizer, tdidf_features = tfidf_extractor(CORPUS)\n",
    "display_features(np.round(tdidf_features.todense(), 2), feature_names)\n",
    "nd_tfidf = tfidf_vectorizer.transform(new_doc)\n",
    "display_features(np.round(nd_tfidf.todense(), 2), feature_names)\n",
    "\n",
    "corpus_tfidf=tfidf_features\n",
    "vocab=tfidf_vectorizer.vocabulary_\n",
    "\n",
    "wt_tfidf_word_vec_features=tfidf_weighted_averaged_word_vectorizer(corpus=TOKENIZED_CORPUS,tfidf_vectors=corpus_tfidf,tfidf_vocabulary=vocab,model=model,num_features=10)\n",
    "\n",
    "print(wt_tfidf_word_vec_features)"
   ]
  }
 ]
}